rm(list=ls())

# set up working directory 
setwd("")


library(foreign)
library(arm)

data <- read.dta("daughter_analytic2.dta")


# focus on 1994 data 
D <- subset(data, year==1994)
N <- nrow(D)

# measurement errors 
asample <- rep(NA,N)

levels(D$out_sample)
for (i in 1:8){
	asample[D$out_sample==levels(D$out_sample)[i]] <- i 
}

aS <- rep(0,N)
aS[asample==1] <- 0 
aS[asample> 1&asample<8] <- 0 
aS[asample==8] <- 1 

bS <- rep(0,N)
bS[!is.na(D$fx_bdaughter)] <- 1

D$match.fd <- ifelse(D$fx_bdaughter==D$fx_daughter,0,1)

match.fd <- D$match.fd
f.daughter <- D$fx_bdaughter


# inference issues ----- differences between samples 
D$c_divorced <- as.numeric(D$marital %in% c("widowed","divorced","separated"))
D$d_con <- as.numeric(D$d_conscale100>0)

varlist <- c("d_rep","d_con","fx_daughter","fx_bdaughter",
	"childs","x_hsize",
	"x_oldage","c_age","c_female","c_born","c_educ",
	"c_sei","c_married","c_divorced")
varlist <- rev(varlist)

varname <- c("% Republican","% Conservative","First=Daughter(cohab)","First=Daughter(bio)",
	"Number of children","Household Size",
	"Age of oldest child","Age","Female","Native-born","Years of education",
	"SEI index","Married","Divorced, Separated, Widowed")
varname <- rev(varname)

tab2 <- NULL 
for (var in varlist){
	a1 <- mean(D[,var],na.rm=TRUE)
	a2 <- mean(D[bS==1,var],na.rm=TRUE)
	a3 <- mean(D[aS==1,var],na.rm=TRUE)
	a4 <- mean(D[bS==1&aS==1,var],na.rm=TRUE)
	s1 <- sd(D[,var],na.rm=TRUE)
	tab2 <- rbind(tab2,c(a1,a2,a3,a4,s1))
}

tab3 <- (tab2[,2:4]-tab2[,1])/tab2[,5]

rownames(tab2) <- varname
tab2 <- tab2[rev(1:nrow(tab2)),c(1,5,2,3,4)]
colnames(tab2) <- c("mean","sd","bio","cohab","bio&cohab")
tab2 <- rbind(tab2,c(length(bS),length(bS),sum(bS==1),sum(aS==2),sum(bS==1&aS==2)))
rownames(tab2)[nrow(tab2)] <- "Sample Size"
write.csv(tab2, "tabs1_sample_stats.csv")

tab3.name <- NULL
for (i in 1:length(varlist)){
	tab3.name <- c(tab3.name,c("",varname[i],""))
}

rownames(tab3) <- varlist 
varname[1] <- "Divorced"

pdf("fig1_barplot.pdf",paper="special",height=10,width=7)
	par(mar=c(5.1,10.1,2.1,2.1))
	a<-barplot(t(tab3), beside=TRUE, horiz=TRUE, col=c("blue","red","purple"), 
		xlim=c(-1.5,1.5), yaxt="n", legend=c("Bio-sample","Cohab-sample","Bio&Cohab-sample"),
		args.legend=c(bty="n"))
	axis(2,at=a[1,], varname, las=2, lwd=0)
	abline(h=a[1,]-0.5,lty=2)
dev.off()


# ------------------------------------------------
# predict the first (biological) child's sex 
m0 <- glm(fx_bdaughter~fx_daughter+x_oldage+x_hsize+childs+
	c_female+c_age+c_educ+c_married
	,data=D[aS==1&bS==1,], family=binomial(link="logit"))
summary(m0)

# predict the mismatch 
m1 <- glm(match.fd~fx_daughter+x_oldage+x_hsize+childs+
	c_female+c_age+c_educ+c_married
	,data=D[aS==1&bS==1,], family=binomial(link="logit"))
summary(m1)

# we are only interested in GSS data sets
D1 <- subset(data, dataset=="GSS")

D1.year <- sort(unique(D1$year))
D1.year <- D1.year[6:29]
# in some years prior to 1977, there are not enough information, and so we excluded

sim.coef0 <- array(NA,dim=c(length(D1.year),1000))
sim.coef1 <- array(NA,dim=c(length(D1.year),1000))
obs.coef <- array(NA,dim=c(length(D1.year),3))

list.simulated.f0 <- list()
list.simulated.f1 <- list()

for (i in 1:length(D1.year)){
	yy <- D1.year[i]	
	S <- D1[D1$year==yy&D1$out_sample=="Analytic Sample",]
	
	mat.simulated.f0 <- array(NA,dim=c(nrow(S),1000))
	mat.simulated.f1 <- array(NA,dim=c(nrow(S),1000))
	
	dv <- S$d_repscale
	observed.f <- S$fx_daughter
	obs.coef[i,] <- summary(lm(dv~observed.f))$coefficients[2,c(1,2,4)]
	
	
	s0 <- m0 
	sim.coef <-sim(m0, 1000)
	
		for (k in 1:1000) {
			s0$coefficients <- coef(sim.coef)[k,]
			p0 <- predict(s0, newdata=S,type='response')
			predicted.f <- ifelse(runif(length(p0))<p0,1,0)
			mat.simulated.f0[,k] <- predicted.f 
			sim.coef0[i,k] <- summary(lm(dv~predicted.f))$coefficients[2,1]
		}
	
	
	s1 <- m1
	sim.coef <-sim(m1, 1000)
	
		for (k in 1:1000) {
			s1$coefficients <- coef(sim.coef)[k,]
			p1 <- predict(s1, newdata=S,type='response')
			mismatched.f <- ifelse(runif(length(p1))<p1,1,0)
			predicted.f <- S$fx_daughter-mismatched.f
			mat.simulated.f1[,k] <- predicted.f 
			sim.coef1[i,k] <- summary(lm(dv~predicted.f))$coefficients[2,1]
		}
	
	list.simulated.f1[[i]] <- mat.simulated.f1
	list.simulated.f0[[i]] <- mat.simulated.f0
}

# save the simulated data sets 
save(obs.coef, sim.coef0, sim.coef1, D1.year, 
	list.simulated.f1, list.simulated.f0, file="sim_measurement.RData")


# load the simulated data sets for exact replication of Figure 2
load("sim_measurement.RData")

# here we plot approximate 95% confidence intervals (using 1.96) 
# but we identify statistical significance based on 90% interval (p-value = 0.1)

lci <- obs.coef[,1] - 1.96*obs.coef[,2]
uci <- obs.coef[,1] + 1.96*obs.coef[,2]
sig <- ifelse(obs.coef[,3]< 0.1, 1,2)

s.coef0 <- rowMeans(sim.coef0)
s.lci0 <- apply(sim.coef0, 1, function(x) sort(x)[25]) # 95% interval
s.uci0 <- apply(sim.coef0, 1, function(x) sort(x)[975]) # 95% interval

s.lci01 <- apply(sim.coef0, 1, function(x) sort(x)[50]) # 90% interval
s.uci01 <- apply(sim.coef0, 1, function(x) sort(x)[950]) # 90% interval

s.sig0 <- as.numeric(s.lci01 >0 | s.uci01 < 0)
s.sig0 <- 2-s.sig0

s.coef1 <- rowMeans(sim.coef1)
s.lci1 <- apply(sim.coef1, 1, function(x) sort(x)[25]) # 95% interval
s.uci1 <- apply(sim.coef1, 1, function(x) sort(x)[975]) # 95% interval

s.lci11 <- apply(sim.coef1, 1, function(x) sort(x)[50]) # 90% interval
s.uci11 <- apply(sim.coef1, 1, function(x) sort(x)[950]) # 90% interval

s.sig1 <- as.numeric(s.lci11 >0 | s.uci11 < 0)
s.sig1 <- 2-s.sig1

pdf("fig2_simulated_measurement.pdf", height=5, width=10)
	plot(D1.year,obs.coef[,1],pch=4, ylim=c(-1,1),ylab="Coef. with 95% (confidence) intervals",
		xlab="Survey Year",xaxt="n")
	for (i in 1:length(D1.year)) lines(rep(D1.year[i],2),c(lci[i],uci[i]),lty=sig[i])
	
	points(D1.year+0.3,s.coef0,col="red",pch=19)
	for (i in 1:length(D1.year)) lines(rep(D1.year[i]+0.3,2),c(s.lci0[i],s.uci0[i]),col="red",lty=s.sig0[i])
	
	points(D1.year-0.3,s.coef1,col="blue",pch=15)
	for (i in 1:length(D1.year)) lines(rep(D1.year[i]-0.3,2),c(s.lci1[i],s.uci1[i]),col="blue",lty=s.sig1[i])
	
	abline(h=0,col="gray")	
	axis(1, at=D1.year,labels=D1.year, col.axis="black", las=2,cex.axis=1)
	legend("topright",c("simulated (biological)","observed","simulated (mismatch)"),col=c("red","black","blue"),lty=1,pch=c(19,4,15),bty="n")
dev.off()

